Importar datos
# Load the commute mode-choice survey shipped with the discrtr package.
# mustWork = TRUE makes system.file() raise an error when the file cannot be
# found, instead of handing an empty string to read_csv().
mc_mode_choice <- read_csv(
  system.file(
    "extdata", "mc_commute.csv",
    package = "discrtr", mustWork = TRUE
  ),
  show_col_types = FALSE
)
# Preview the first rows of the table.
head(mc_mode_choice)
## # A tibble: 6 × 39
## RespondentID choice avcycle avwalk avhsr avcar timecycle timewalk accesshsr
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 566872636 3 0 1 1 0 6.21 21.3 3
## 2 566873140 3 0 1 1 1 3.73 12.8 4
## 3 566874266 3 0 0 1 1 100000 100000 3
## 4 566874842 2 1 1 1 0 5.83 20 10.7
## 5 566881170 2 1 1 1 0 5.83 20 6.21
## 6 566907438 2 0 1 1 0 100000 10 7.14
## # ℹ 30 more variables: waitingtimehsr <dbl>, transfer <dbl>, timehsr <dbl>,
## # timecar <dbl>, parking <dbl>, vehind <dbl>, owncycle <dbl>, gender <dbl>,
## # work <dbl>, visa <dbl>, age <dbl>, solo <dbl>, shared <dbl>, family <dbl>,
## # child <dbl>, primary_caregiver <dbl>, LAT <dbl>, LONG <dbl>, DAUID <dbl>,
## # mhi <dbl>, dwell_den <dbl>, lum <dbl>, st_den <dbl>, inter_den <dbl>,
## # SF_P_ratio <dbl>, side_den <dbl>, Shelters_SD <dbl>, Shelters_D <dbl>,
## # Shelters_A <dbl>, Shelters_SA <dbl>
Glimpse
# Compact column-by-column overview: name, type and first values.
mc_mode_choice |> glimpse()
## Rows: 1,376
## Columns: 39
## $ RespondentID <dbl> 566872636, 566873140, 566874266, 566874842, 56688117…
## $ choice <dbl> 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 2, 4, 2, 2, 3, 2, 4, 3…
## $ avcycle <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ avwalk <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1…
## $ avhsr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ avcar <dbl> 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0…
## $ timecycle <dbl> 6.211180e+00, 3.726708e+00, 1.000000e+05, 5.828157e+…
## $ timewalk <dbl> 21.31439, 12.78863, 100000.00000, 20.00000, 20.00000…
## $ accesshsr <dbl> 3.00, 4.00, 3.00, 10.66, 6.21, 7.14, 10.66, 15.00, 2…
## $ waitingtimehsr <dbl> 15.00, 15.00, 2.00, 10.23, 10.23, 10.23, 10.23, 3.00…
## $ transfer <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 1e+…
## $ timehsr <dbl> 5, 10, 15, 8, 5, 3, 20, 25, 8, 5, 5, 25, 2, 3, 25, 5…
## $ timecar <dbl> 100000, 2, 4, 100000, 100000, 100000, 5, 17, 4, 1000…
## $ parking <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ vehind <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ owncycle <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ gender <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ work <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0…
## $ visa <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ age <dbl> 21, 23, 20, 20, 19, 19, 49, 19, 20, 23, 25, 38, 20, …
## $ solo <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ shared <dbl> 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ family <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1…
## $ child <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1…
## $ primary_caregiver <dbl> 1e+05, 1e+05, 0e+00, 1e+05, 1e+05, 1e+05, 1e+00, 0e+…
## $ LAT <dbl> 43.26302, 43.25885, 43.25222, 43.25782, 43.25562, 43…
## $ LONG <dbl> -79.90074, -79.90476, -79.93953, -79.91941, -79.9204…
## $ DAUID <dbl> 35250503, 35250675, 35250964, 35250669, 35250669, 35…
## $ mhi <dbl> 3.3902, 4.5770, 6.3081, 5.4911, 5.4911, 9.8697, 6.30…
## $ dwell_den <dbl> 941.3980, 1688.5725, 534.6675, 892.1744, 892.1744, 3…
## $ lum <dbl> 0.805636, 0.280830, 0.455743, 0.479460, 0.479460, 0.…
## $ st_den <dbl> 14.376206, 19.497536, 13.556608, 14.307826, 14.30782…
## $ inter_den <dbl> 39.224916, 109.529025, 15.276213, 45.883253, 45.8832…
## $ SF_P_ratio <dbl> 0.230931, 0.356169, 0.074477, 0.268249, 0.268249, 0.…
## $ side_den <dbl> 22.633222, 39.640032, 8.228497, 37.457574, 37.457574…
## $ Shelters_SD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Shelters_D <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1…
## $ Shelters_A <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
## $ Shelters_SA <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
Info. de las primeras 5 variables
# Descriptive statistics for the first five columns only.
mc_mode_choice[, 1:5] |> summary()
## RespondentID choice avcycle avwalk
## Min. :566872636 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:567814188 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :568682048 Median :2.000 Median :0.0000 Median :1.0000
## Mean :570566454 Mean :2.618 Mean :0.2747 Mean :0.6613
## 3rd Qu.:574925212 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :587675235 Max. :4.000 Max. :1.0000 Max. :1.0000
## avhsr
## Min. :0.0000
## 1st Qu.:1.0000
## Median :1.0000
## Mean :0.9608
## 3rd Qu.:1.0000
## Max. :1.0000
Para convertir la variable de elección en un factor hacemos…
# Turn the numeric choice codes into a factor; the labels are assigned to the
# sorted distinct codes in the order given here.
mc_mode_choice[["choice"]] <- factor(
  mc_mode_choice[["choice"]],
  labels = c("Cycle", "Walk", "HSR", "Car")
)
Y se aplica summary
# For a factor, summary() reports the number of observations per level.
mc_mode_choice[["choice"]] |> summary()
## Cycle Walk HSR Car
## 48 711 336 281
Para resumir una variable…
# Five-number summary plus mean of the cycling time column.
mc_mode_choice[["timecycle"]] |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.29 3.79 5.83 34014.86 100000.00 100000.00
# Keep only the cycling and walking times and drop rows where either one is
# 100000. NOTE(review): 100000 appears to be a placeholder rather than a real
# travel time (presumably "mode not available") - confirm against the data
# dictionary.
# filter() is namespace-qualified like select() so that another attached
# package masking the name cannot change which function is called.
time.Active.clean <- mc_mode_choice |>
  dplyr::select(timecycle, timewalk) |>
  dplyr::filter(timecycle != 100000, timewalk != 100000)
Se obtiene un resumen estadístico:
# Descriptive statistics of the filtered cycling and walking times.
time.Active.clean |> summary()
## timecycle timewalk
## Min. : 0.2914 Min. : 1.00
## 1st Qu.: 2.9141 1st Qu.:10.00
## Median : 4.3711 Median :15.00
## Mean : 4.5852 Mean :16.10
## 3rd Qu.: 5.8282 3rd Qu.:20.00
## Max. :17.4845 Max. :62.11
Se grafica el resumen estadístico
# Overlay the binned distributions (5-minute bins) of cycling time (blue) and
# walking time (yellow) as semi-transparent area layers.
ggplot(time.Active.clean) +
  geom_area(
    aes(x = timecycle),
    stat = "bin", binwidth = 5,
    fill = "blue", color = "black", alpha = 0.6
  ) +
  geom_area(
    aes(x = timewalk),
    stat = "bin", binwidth = 5,
    fill = "yellow", color = "black", alpha = 0.6
  ) +
  labs(x = "Tiempo (minutos)")
Resumen con 2 variables
# Summarise the categorical choice together with the numeric side_den column.
# select() is namespace-qualified, as elsewhere in this file, so that another
# attached package masking the name cannot change which function is called.
mc_mode_choice |>
  dplyr::select(choice, side_den) |>
  summary()
## choice side_den
## Cycle: 48 Min. : 0.00
## Walk :711 1st Qu.:18.19
## HSR :336 Median :22.63
## Car :281 Mean :24.18
## 3rd Qu.:35.70
## Max. :59.41
Se traza la variable categórica
# Box plot of side_den for each chosen mode.
ggplot(mc_mode_choice, aes(x = choice, y = side_den)) +
  geom_boxplot()
Invoke data set Mode from package mlogit. To do this you need to first load the package. This is a data set with choices about mode of transportation. This is done as follows:
# Load the Mode data set. Naming the package makes the call work whether or
# not mlogit is currently attached.
data("Mode", package = "mlogit")
Once you have loaded the data set, answer the following questions: Describe this data set. How many variables are there and of which type (i.e., categorical/quantitative)?
# Column-by-column overview of Mode: name, type and first values.
Mode |> glimpse()
## Rows: 453
## Columns: 9
## $ choice <fct> car, rail, car, car, car, car, car, car, bus, car, rail, …
## $ cost.car <dbl> 1.5070097, 6.0569985, 5.7946769, 1.8691439, 2.4989523, 4.…
## $ cost.carpool <dbl> 2.3356118, 2.8969191, 2.1374543, 2.5724266, 1.7220099, 0.…
## $ cost.bus <dbl> 1.800512, 2.237128, 2.576385, 1.903518, 2.686000, 1.84765…
## $ cost.rail <dbl> 2.358920, 1.855450, 2.747479, 2.268276, 2.973866, 2.31005…
## $ time.car <dbl> 18.503200, 31.311107, 22.547429, 26.090282, 4.699140, 3.0…
## $ time.carpool <dbl> 26.338233, 34.256956, 23.255171, 29.896023, 12.414084, 9.…
## $ time.bus <dbl> 20.86779, 67.18189, 63.30906, 19.75270, 43.09204, 12.8256…
## $ time.rail <dbl> 30.03347, 60.29313, 49.17164, 13.47268, 39.74325, 43.5442…
How many different modes of transportation are in this data set? What is the most popular mode? What is the least popular mode?
# Counts per level for the choice factor and descriptive statistics for the
# numeric columns.
Mode |> summary()
## choice cost.car cost.carpool cost.bus
## car :218 Min. :0.4099 Min. :0.1293 Min. :1.013
## carpool: 32 1st Qu.:3.6964 1st Qu.:0.9519 1st Qu.:1.783
## bus : 81 Median :4.8796 Median :1.6665 Median :2.027
## rail :122 Mean :4.8735 Mean :1.6863 Mean :2.036
## 3rd Qu.:6.2255 3rd Qu.:2.4581 3rd Qu.:2.321
## Max. :8.8555 Max. :3.2953 Max. :2.740
## cost.rail time.car time.carpool time.bus
## Min. :1.272 Min. : 2.404 Min. : 8.385 Min. : 1.969
## 1st Qu.:1.947 1st Qu.:21.835 1st Qu.:28.391 1st Qu.:25.457
## Median :2.198 Median :37.497 Median :40.637 Median :41.415
## Mean :2.212 Mean :37.044 Mean :39.771 Mean :39.923
## 3rd Qu.:2.476 3rd Qu.:53.104 3rd Qu.:51.843 3rd Qu.:52.805
## Max. :3.113 Max. :66.871 Max. :65.009 Max. :75.681
## time.rail
## Min. : 4.621
## 1st Qu.:28.143
## Median :40.034
## Mean :39.505
## 3rd Qu.:49.172
## Max. :73.998
# Bar chart of how many observations chose each mode; the legend is hidden
# because the x axis already names the modes.
ggplot(Mode, aes(x = choice, fill = choice)) +
  geom_bar(color = "black", show.legend = FALSE) +
  theme_bw()
In general, what is the most expensive mode? The least expensive?
# Overlay the density of each mode's cost column. The constant strings mapped
# to fill act as legend keys that scale_fill_manual() colours by name.
ggplot(Mode) +
  geom_density(aes(x = cost.car, fill = "car"), color = "black", alpha = 0.5) +
  geom_density(
    aes(x = cost.carpool, fill = "carpool"),
    color = "black", alpha = 0.5
  ) +
  geom_density(aes(x = cost.bus, fill = "bus"), color = "black", alpha = 0.5) +
  geom_density(
    aes(x = cost.rail, fill = "rail"),
    color = "black", alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Mode",
    values = c(
      "car" = "firebrick",
      "carpool" = "dodgerblue",
      "bus" = "darkgoldenrod2",
      "rail" = "cyan"
    )
  ) +
  labs(x = "Cost") +
  theme_bw()
Create a plot showing the univariate distributions of time by car and time by bus. Discuss.
# Overlay the density of each mode's travel-time column, then pass the static
# plot to ggplotly().
grafica_time <- ggplot(Mode) +
  geom_density(aes(x = time.car, fill = "car"), color = "black", alpha = 0.5) +
  geom_density(
    aes(x = time.carpool, fill = "carpool"),
    color = "black", alpha = 0.5
  ) +
  geom_density(aes(x = time.bus, fill = "bus"), color = "black", alpha = 0.5) +
  geom_density(
    aes(x = time.rail, fill = "rail"),
    color = "black", alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Mode",
    values = c(
      "car" = "firebrick",
      "carpool" = "dodgerblue",
      "bus" = "darkgoldenrod2",
      "rail" = "cyan"
    )
  ) +
  labs(x = "Time") +
  theme_bw()
ggplotly(grafica_time)
How do choices relate to cost by the different modes?
# Draw one box plot per numeric column of Mode, split by the chosen mode.
varnum <- Mode |>
  dplyr::select(where(is.numeric)) |>
  names()
for (var in varnum) {
  # .data[[var]] looks the column up by its name string; it replaces
  # eval(as.name(var)), which built and evaluated a symbol by hand.
  grafica_box <- ggplot(
    Mode,
    aes(x = choice, y = .data[[var]], fill = choice)
  ) +
    geom_boxplot(show.legend = FALSE) +
    ylab(var) +
    theme_bw()
  # Explicit print() is needed because plots are not auto-printed in a loop.
  print(grafica_box)
}
Ejercicios:
4.- Describe los datos del ejercicio. ¿Cuántas variables hay y de qué tipo (categóricas o cuantitativas)?
Una variable categórica y 8 cuantitativas en el data set Mode
5.- ¿Cuántos modos diferentes de transporte hay en este ejercicio? ¿Cuál es el más popular y cuál es el menos popular?
Hay cuatro: ° Carro (más popular, 218) ° Auto compartido (menos popular, 32) ° Autobús (81) ° Tren (122)
6.- En general, ¿cuál es el modo de transporte más costoso y cuál el menos costoso?
El auto es el más costoso y el menos costoso es el auto compartido
7.- Crea un gráfico mostrando la distribución univariada del tiempo en auto y en autobús.
# Overlay the travel-time densities of car and bus only, then pass the static
# plot to ggplotly(). The palette lists just the two modes that are drawn; the
# carpool and rail entries were unused in this plot.
grafica_time <- ggplot(Mode) +
  geom_density(aes(x = time.car, fill = "car"), color = "black", alpha = 0.5) +
  geom_density(aes(x = time.bus, fill = "bus"), color = "black", alpha = 0.5) +
  scale_fill_manual(
    name = "Mode",
    values = c("car" = "firebrick", "bus" = "darkgoldenrod2")
  ) +
  xlab("Time") +
  theme_bw()
ggplotly(grafica_time)
En términos generales, el auto es más eficiente en distancias cortas; sin embargo, en trayectos superiores a 45 min (aprox.) el autobús se vuelve más eficiente.
8.- ¿Cómo se relacionan las preferencias con el costo en los diferentes modos de transporte?
# Draw one box plot per numeric column of Mode, split by the chosen mode.
varnum <- Mode |>
  dplyr::select(where(is.numeric)) |>
  names()
for (var in varnum) {
  # .data[[var]] looks the column up by its name string; it replaces
  # eval(as.name(var)), which built and evaluated a symbol by hand.
  grafica_box <- ggplot(
    Mode,
    aes(x = choice, y = .data[[var]], fill = choice)
  ) +
    geom_boxplot(show.legend = FALSE) +
    ylab(var) +
    theme_bw()
  # Explicit print() is needed because plots are not auto-printed in a loop.
  print(grafica_box)
}